reveal.js styling¶

In [1]:
%%html
<style type="text/css">

/* Remove default spacing around highlighted code blocks on reveal.js slides */
.reveal div.highlight {
    margin: 0; 
}

/* Code input areas: full slide width with a fixed, readable font size */
.reveal div.highlight>pre {
    margin: 0; 
    width: 100%;
    font-size: 15px;
}

/* Cell output areas: narrower column, Jupyter's code font size, no box shadow */
.reveal div.jp-OutputArea-output>pre {
    margin: 0; 
    width: 75%;
    font-size: var(--jp-code-font-size);
    box-shadow: none;
}

</style>

Data Visualization: Distinctive Words¶

Der Code basiert auf:

Karsdorp, Folgert / Kestemont, Mike / Riddell, Allen, Humanities Data Analysis. Case Studies with Python, Princeton University Press 2021, S. 19-30.

Link zum Code im Jupyter Book

Import der Python-Packages¶

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
from collections import Counter

import random
import re
import string

%matplotlib inline

Einlesen der Daten in einen Dataframe¶

In [3]:
# Load the tokenized tweet corpus; parse 'date' as (timezone-aware) datetimes.
# NOTE(review): relative path assumes the notebook is run from its own folder.
df = pd.read_csv('../data/220128-wutbuerger-tokenized.csv', 
                 parse_dates=['date'], encoding='utf8')
In [4]:
df.loc[:, 'year'] = df.loc[:, 'date'].dt.year

Dataframe inspizieren¶

In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9149 entries, 0 to 9148
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype              
---  ------                   --------------  -----              
 0   id                       9149 non-null   int64              
 1   date                     9149 non-null   datetime64[ns, UTC]
 2   tweet                    9149 non-null   object             
 3   hashtags                 9149 non-null   object             
 4   username                 9149 non-null   object             
 5   link                     9149 non-null   object             
 6   nretweets                9149 non-null   int64              
 7   nlikes                   9149 non-null   int64              
 8   nreplies                 9149 non-null   int64              
 9   nqoutes                  0 non-null      float64            
 10  char_per_url_free_tweet  9149 non-null   int64              
 11  tweet_clean              9149 non-null   object             
 12  tweet_clean_removed      9149 non-null   object             
 13  tokens                   9149 non-null   object             
 14  year                     9149 non-null   int64              
dtypes: datetime64[ns, UTC](1), float64(1), int64(6), object(7)
memory usage: 1.0+ MB

Helferfunktionen¶

In [6]:
# Count keywords

def count_keywords(tokens, keywords):
    """Count how often each keyword occurs in ``tokens``.

    Returns a list of counts aligned with the order of ``keywords``
    (0 for keywords that never appear in ``tokens``).
    """
    occurrences = Counter(t for t in tokens if t in keywords)
    # Counter returns 0 for missing keys, so every keyword gets a count.
    return [occurrences[kw] for kw in keywords]

def count_keywords_by(df, by, keywords, column='tokens'):
    """Count keyword frequencies per group.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose ``column`` holds whitespace-separated token strings.
    by : str
        Name of the grouping column in ``df`` (e.g. 'year').
    keywords : list of str
        Keywords to count; the result has one column per keyword.
    column : str, default 'tokens'
        Name of the column containing the token strings.

    Returns
    -------
    pd.DataFrame
        One row per group value, one column per keyword, sorted by ``by``.
    """
    # str.split turns each token string into a list, as count_keywords expects.
    freq_matrix = df[column].str.split().apply(count_keywords, keywords=keywords)
    freq_df = pd.DataFrame.from_records(freq_matrix, columns=keywords)
    # Copy the grouping column POSITIONALLY (.to_numpy()): from_records gave
    # freq_df a fresh RangeIndex, so label-based assignment (freq_df[by] = df[by])
    # would silently misalign rows whenever df carries a filtered, non-default
    # index (e.g. a subset taken without reset_index).
    freq_df[by] = df[by].to_numpy()
    return freq_df.groupby(by=by).sum().sort_values(by)
In [7]:
# Count word frequencies across the whole corpus.
# Adapted from:
# https://gist.github.com/susanli2016/69ec5333e9846044abd74268eed9d85b#file-top_unigram-py

def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent words in ``corpus`` as (word, count) pairs.

    With ``n=None`` the full list is returned, sorted by descending count.
    """
    vectorizer = CountVectorizer().fit(corpus)
    doc_term_matrix = vectorizer.transform(corpus)
    # Column sums over the document-term matrix give per-word totals.
    totals = doc_term_matrix.sum(axis=0)
    frequencies = [(word, totals[0, col])
                   for word, col in vectorizer.vocabulary_.items()]
    frequencies.sort(key=lambda pair: pair[1], reverse=True)
    return frequencies[:n]

Distinctive Words: Betrachtung Gesamtzeitraum vor und nach 2015¶

Keywords zählen¶

In [8]:
keywords = get_top_n_words(df.loc[:,'tokens'], 1000)
In [9]:
keywords = (list(list(zip(*keywords))[0]))
In [10]:
df_counts = count_keywords_by(df, by='year', keywords=keywords)
In [11]:
df_counts.head(5).T
Out[11]:
year 2010 2011 2012 2013 2014
wutbürger 516 785 465 431 413
afd 0 0 0 1 6
ich 25 55 17 18 23
mal 32 32 17 15 17
pegida 0 0 0 0 58
... ... ... ... ... ...
lied 1 1 0 0 0
gehören 0 0 0 0 0
mist 1 2 1 0 1
terror 0 0 0 0 0
monat 0 3 1 0 0

1000 rows × 5 columns

Normalisieren¶

In [12]:
df_key = df_counts.copy()
df_key.loc[:, 'total']= df_key.sum(axis=1)
In [13]:
# Normalize each row by its maximum value. Since 'total' is the row sum of
# non-negative counts, it is also the row maximum — so this effectively
# divides every keyword count by the row total (relative frequencies).
df_key = df_key.apply(lambda x: x/x.max(), axis=1)
df_key = df_key.drop(['total'], axis=1)
df_key.head(5).T
Out[13]:
year 2010 2011 2012 2013 2014
wutbürger 0.221745 0.235382 0.200517 0.244747 0.244668
afd 0.000000 0.000000 0.000000 0.000568 0.003555
ich 0.010743 0.016492 0.007331 0.010221 0.013626
mal 0.013752 0.009595 0.007331 0.008518 0.010071
pegida 0.000000 0.000000 0.000000 0.000000 0.034360
... ... ... ... ... ...
lied 0.000430 0.000300 0.000000 0.000000 0.000000
gehören 0.000000 0.000000 0.000000 0.000000 0.000000
mist 0.000430 0.000600 0.000431 0.000000 0.000592
terror 0.000000 0.000000 0.000000 0.000000 0.000000
monat 0.000000 0.000900 0.000431 0.000000 0.000000

999 rows × 5 columns

Berechnung der Keyness mit Chi-Square¶

In [14]:
labels = ['vor 2015' if year < 2015 else 'nach 2015' for year in df_key.index]
# replace missing values with zero (.fillna(0)),
# and compute the chi2 statistic:
keyness, _ = chi2(df_key.fillna(0), labels)
# Turn keyness values into a Series, and sort in descending order:
keyness = pd.Series(keyness, index=df_key.columns).sort_values(ascending=False)
In [15]:
keyness.head(15)
Out[15]:
wort          0.129818
s21           0.115413
afd           0.095070
jahr          0.086162
rt            0.084462
wutbürger     0.053477
2010          0.050938
bremen        0.038017
nazi          0.037076
querdenker    0.033656
düringer      0.033302
stuttgart     0.032370
ber           0.029268
noafd         0.027129
mutbürger     0.026170
dtype: float64

Statischer Scatterplot¶

In [16]:
counts = df_counts.copy()
# Percentile ranks of total keyword frequency before/after the 2015 cutoff
# (2015 itself belongs to the "nach 2015" group, matching the labels above).
pre_2015 = counts[counts.index < 2015].sum().rank(method='dense', pct=True)
post_2015 = counts[(counts.index > 2014)].sum().rank(method='dense', pct=True)
rankings = pd.DataFrame({'Vor_2015': pre_2015, 'Nach_2015': post_2015})
fig = plt.figure(figsize=(10, 5))
# Color encodes the rank shift: positive = more prominent before 2015.
plt.scatter(x=rankings['Nach_2015'], y=rankings['Vor_2015'], 
            c=rankings['Vor_2015'] - rankings['Nach_2015'],
            alpha=0.7, cmap='viridis')
# Annotate the 25 most distinctive words (highest chi2 keyness).
for i, row in rankings.loc[keyness.head(25).index].iterrows():
    plt.annotate(i, xy=(row['Nach_2015'], row['Vor_2015']))
plt.xlabel('Rank nach Frequenz nach 2015')
plt.ylabel('Rank nach Frequenz vor 2015');  # fixed typo: 'Frequen' -> 'Frequenz'

Interaktiver Scatterplot mit Plotly¶

In [17]:
fig = px.scatter(rankings, x='Nach_2015', y='Vor_2015', 
                 hover_name=rankings.index,
                 color=rankings['Vor_2015'] - rankings['Nach_2015'],
                 color_continuous_scale='Viridis', width=750, height=450)
fig.show()

Distinctive Words:¶

Betrachtung 2010 bis 2020¶

vor und nach 2015¶

In [18]:
df_A = df[df.loc[:, 'year'] < 2020]
In [19]:
df_A = df_A.reset_index(drop=True)

Keywords zählen¶

In [20]:
keywords_A = get_top_n_words(df_A.loc[:,'tokens'], 1000)
In [21]:
keywords_A = (list(list(zip(*keywords_A))[0]))
In [22]:
df_counts_A = count_keywords_by(df_A, by='year', keywords=keywords_A)
In [23]:
df_counts_A.head(5).T
Out[23]:
year 2010 2011 2012 2013 2014
wutbürger 516 785 465 431 413
afd 0 0 0 1 6
mal 32 32 17 15 17
pegida 0 0 0 0 58
ich 25 55 17 18 23
... ... ... ... ... ...
gegenrechts 0 0 0 0 0
schüren 0 0 0 1 0
initiative 0 0 3 1 1
les 0 1 1 0 0
normal 0 1 0 1 1

999 rows × 5 columns

Normalisieren¶

In [24]:
df_key_A = df_counts_A.copy()
df_key_A.loc[:, 'total']= df_key_A.sum(axis=1)
In [25]:
df_key_A = df_key_A.apply(lambda x: x/x.max(), axis=1)
df_key_A = df_key_A.drop(['total'], axis=1)
df_key_A.head(5).T
Out[25]:
year 2010 2011 2012 2013 2014
wutbürger 0.217630 0.229734 0.193669 0.235005 0.242229
afd 0.000000 0.000000 0.000000 0.000545 0.003519
mal 0.013496 0.009365 0.007080 0.008179 0.009971
pegida 0.000000 0.000000 0.000000 0.000000 0.034018
ich 0.010544 0.016096 0.007080 0.009815 0.013490
... ... ... ... ... ...
gegenrechts 0.000000 0.000000 0.000000 0.000000 0.000000
schüren 0.000000 0.000000 0.000000 0.000545 0.000000
initiative 0.000000 0.000000 0.001249 0.000545 0.000587
les 0.000000 0.000293 0.000416 0.000000 0.000000
normal 0.000000 0.000293 0.000000 0.000545 0.000587

998 rows × 5 columns

Berechnung der Keyness mit Chi-Square¶

In [26]:
labels_A = ['vor 2015' if year < 2015 else 'nach 2015' for year in df_key_A.index]
# replace missing values with zero (.fillna(0)),
# and compute the chi2 statistic:
keyness_A, _ = chi2(df_key_A.fillna(0), labels_A)
# Turn keyness values into a Series, and sort in descending order:
keyness_A = pd.Series(keyness_A, index=df_key_A.columns).sort_values(ascending=False)
In [27]:
keyness_A.head(15)
Out[27]:
afd          0.113709
wort         0.080188
s21          0.060164
jahr         0.051791
rt           0.048140
nazi         0.038770
2010         0.031423
noafd        0.031199
bremen       0.023341
stuttgart    0.020815
besorgen     0.020531
wutbürger    0.020286
düringer     0.019523
amp          0.017205
ber          0.017022
dtype: float64

Statischer Scatterplot¶

In [28]:
counts_A = df_counts_A.copy()
pre_2015_A = counts_A[counts_A.index < 2015].sum().rank(method='dense', pct=True)
# BUGFIX: was `counts_A.index > 2015`, which dropped the year 2015 from the
# "nach 2015" group — inconsistent with labels_A ('nach 2015' for year >= 2015)
# and with the overall analysis above, which uses `> 2014`.
post_2015_A = counts_A[(counts_A.index > 2014)].sum().rank(method='dense', pct=True)
rankings_A = pd.DataFrame({'Vor_2015': pre_2015_A, 'Nach_2015': post_2015_A})
fig = plt.figure(figsize=(10, 5))
# Color encodes the rank shift: positive = more prominent before 2015.
plt.scatter(x=rankings_A['Nach_2015'], y=rankings_A['Vor_2015'], 
            c=rankings_A['Vor_2015'] - rankings_A['Nach_2015'],
            alpha=0.7, cmap='viridis')
# Annotate the 25 most distinctive words (highest chi2 keyness).
for i, row in rankings_A.loc[keyness_A.head(25).index].iterrows():
    plt.annotate(i, xy=(row['Nach_2015'], row['Vor_2015']))
plt.xlabel('Rank nach Frequenz nach 2015')
plt.ylabel('Rank nach Frequenz vor 2015');  # fixed typo: 'Frequen' -> 'Frequenz'

Interaktiver Scatterplot mit Plotly¶

In [29]:
fig = px.scatter(rankings_A, x='Nach_2015', y='Vor_2015', 
                 hover_name=rankings_A.index,
                 color=rankings_A['Vor_2015'] - rankings_A['Nach_2015'],
                 color_continuous_scale='Viridis', width=750, height=450)
fig.show()
In [30]:
# NOTE(review): this cell is an exact duplicate of the previous plotly
# scatterplot (In [29]) — consider removing it.
fig = px.scatter(rankings_A, x='Nach_2015', y='Vor_2015', 
                 hover_name=rankings_A.index,
                 color=rankings_A['Vor_2015'] - rankings_A['Nach_2015'],
                 color_continuous_scale='Viridis', width=750, height=450)
fig.show()

Distinctive Words:¶

Betrachtung 2015 bis 2022¶

vor und nach 2020¶

In [31]:
df_B = df[df.loc[:, 'year'] > 2014]
In [32]:
df_B = df_B.reset_index(drop=True)

Keywords zählen¶

In [33]:
keywords_B = get_top_n_words(df_B.loc[:,'tokens'], 1000)
In [34]:
keywords_B = (list(list(zip(*keywords_B))[0]))
In [35]:
df_counts_B = count_keywords_by(df_B, by='year', keywords=keywords_B)
In [36]:
df_counts_B.head(5).T
Out[36]:
year 2015 2016 2017 2018 2019
wutbürger 708 1140 822 1283 883
afd 34 133 130 353 130
mal 31 40 56 96 63
nazi 23 28 23 106 108
pegida 117 70 32 77 34
... ... ... ... ... ...
kritisieren 0 1 1 2 0
chef 0 1 2 3 2
schicken 0 3 0 1 3
wünsche 0 1 1 5 1
verschwörungstheorien 2 2 0 1 0

1000 rows × 5 columns

Normalisieren¶

In [37]:
df_key_B = df_counts_B.copy()
df_key_B.loc[:, 'total']= df_key_B.sum(axis=1)
In [38]:
df_key_B = df_key_B.apply(lambda x: x/x.max(), axis=1)
df_key_B = df_key_B.drop(['total'], axis=1)
df_key_B.head(5).T
Out[38]:
year 2015 2016 2017 2018 2019
wutbürger 0.220629 0.219738 0.206377 0.128274 0.142649
afd 0.010595 0.025636 0.032639 0.035293 0.021002
mal 0.009660 0.007710 0.014060 0.009598 0.010178
nazi 0.007167 0.005397 0.005775 0.010598 0.017447
pegida 0.036460 0.013493 0.008034 0.007698 0.005493
... ... ... ... ... ...
kritisieren 0.000000 0.000193 0.000251 0.000200 0.000000
chef 0.000000 0.000193 0.000502 0.000300 0.000323
schicken 0.000000 0.000578 0.000000 0.000100 0.000485
wünsche 0.000000 0.000193 0.000251 0.000500 0.000162
verschwörungstheorien 0.000623 0.000386 0.000000 0.000100 0.000000

1000 rows × 5 columns

Berechnung der Keyness mit Chi-Square¶

In [39]:
labels_B = ['vor 2020' if year < 2020 else 'nach 2020' for year in df_key_B.index]
# replace missing values with zero (.fillna(0)),
# and compute the chi2 statistic:
keyness_B, _ = chi2(df_key_B.fillna(0), labels_B)
# Turn keyness values into a Series, and sort in descending order:
keyness_B = pd.Series(keyness_B, index=df_key_B.columns).sort_values(ascending=False)
In [40]:
keyness_B.head(15)
Out[40]:
querdenker          0.089637
corona              0.036915
massnahmengegner    0.030801
putin               0.029628
ander               0.024669
pegida              0.023717
covidioten          0.023609
faschist            0.023009
schwurbler          0.020800
mainstream          0.019963
nftcommunity        0.015508
rechtsradikaler     0.015052
wutbürger           0.014950
impfgegner          0.014218
protestieren        0.013236
dtype: float64

Statischer Scatterplot¶

In [41]:
counts_B = df_counts_B.copy()
# Percentile ranks of total keyword frequency before/after the 2020 cutoff
# (2020 itself belongs to the "nach 2020" group, matching labels_B above).
pre_2020_B = counts_B[counts_B.index < 2020].sum().rank(method='dense', pct=True)
post_2020_B = counts_B[(counts_B.index > 2019)].sum().rank(method='dense', pct=True)
rankings_B = pd.DataFrame({'Vor_2020': pre_2020_B, 'Nach_2020': post_2020_B})
fig = plt.figure(figsize=(10, 5))
# Color encodes the rank shift: positive = more prominent before 2020.
plt.scatter(x=rankings_B['Nach_2020'], y=rankings_B['Vor_2020'], 
            c=rankings_B['Vor_2020'] - rankings_B['Nach_2020'],
            alpha=0.7, cmap='viridis')
# Annotate the 15 most distinctive words (highest chi2 keyness).
for i, row in rankings_B.loc[keyness_B.head(15).index].iterrows():
    plt.annotate(i, xy=(row['Nach_2020'], row['Vor_2020']))
plt.xlabel('Rank nach Frequenz nach 2020')
plt.ylabel('Rank nach Frequenz vor 2020');  # fixed typo: 'Frequen' -> 'Frequenz'

Interaktiver Scatterplot mit Plotly¶

In [42]:
fig = px.scatter(rankings_B, x='Nach_2020', y='Vor_2020', 
                 hover_name=rankings_B.index,
                 color=rankings_B['Vor_2020'] - rankings_B['Nach_2020'],
                 color_continuous_scale='Viridis', width=750, height=450)
fig.show()

Distinctive Words:¶

Betrachtung 2010 bis 2014¶

und 2020 bis 2022¶

In [43]:
df_C = df[(df.loc[:, 'year'] < 2015) | (df.loc[:, 'year'] > 2019)]
In [44]:
df_C = df_C.reset_index(drop=True)

Keywords zählen¶

In [45]:
keywords_C = get_top_n_words(df_C.loc[:,'tokens'], 1000)
In [46]:
keywords_C = (list(list(zip(*keywords_C))[0]))
In [47]:
df_counts_C = count_keywords_by(df_C, by='year', keywords=keywords_C)
In [48]:
df_counts_C.head(5).T
Out[48]:
year 2010 2011 2012 2013 2014
wutbürger 516 785 465 431 413
wort 196 27 8 11 7
s21 43 92 59 46 19
ich 25 55 17 18 23
jahr 159 35 7 5 5
... ... ... ... ... ...
maskenverweigerer 0 0 0 0 0
leser 1 2 1 0 1
blödsinn 1 0 0 0 0
dummheit 0 1 0 0 1
funktionieren 0 0 0 0 2

999 rows × 5 columns

Normalisieren¶

In [49]:
df_key_C = df_counts_C.copy()
df_key_C.loc[:, 'total']= df_key_C.sum(axis=1)
In [50]:
df_key_C = df_key_C.apply(lambda x: x/x.max(), axis=1)
df_key_C = df_key_C.drop(['total'], axis=1)
df_key_C.head(5).T
Out[50]:
year 2010 2011 2012 2013 2014
wutbürger 0.216625 0.223075 0.183722 0.230975 0.242087
wort 0.082284 0.007673 0.003161 0.005895 0.004103
s21 0.018052 0.026144 0.023311 0.024652 0.011137
ich 0.010495 0.015629 0.006717 0.009646 0.013482
jahr 0.066751 0.009946 0.002766 0.002680 0.002931
... ... ... ... ... ...
maskenverweigerer 0.000000 0.000000 0.000000 0.000000 0.000000
leser 0.000420 0.000568 0.000395 0.000000 0.000586
blödsinn 0.000420 0.000000 0.000000 0.000000 0.000000
dummheit 0.000000 0.000284 0.000000 0.000000 0.000586
funktionieren 0.000000 0.000000 0.000000 0.000000 0.001172

998 rows × 5 columns

Berechnung der Keyness mit Chi-Square¶

In [51]:
# NOTE(review): the `year < 2020` split works only because df_C was filtered
# to years < 2015 or > 2019 (so 2015-2019 never occur); the label strings
# still say '2015' even though the second group is 2020+ — consider renaming
# to 'vor 2015' / 'nach 2020' for clarity.
labels_C = ['vor 2015' if year < 2020 else 'nach 2015' for year in df_key_C.index]
# replace missing values with zero (.fillna(0)),
# and compute the chi2 statistic:
keyness_C, _ = chi2(df_key_C.fillna(0), labels_C)
# Turn keyness values into a Series, and sort in descending order:
keyness_C = pd.Series(keyness_C, index=df_key_C.columns).sort_values(ascending=False)
In [52]:
keyness_C.head(15)
Out[52]:
querdenker          0.085923
s21                 0.060513
wutbürger           0.058647
afd                 0.053915
wort                0.053183
jahr                0.040039
ander               0.038804
rt                  0.036780
corona              0.035873
putin               0.030712
massnahmengegner    0.029417
faschist            0.029006
nazi                0.028433
covidioten          0.023140
schwurbler          0.021320
dtype: float64

Statischer Scatterplot¶

In [53]:
counts_C = df_counts_C.copy()
# Percentile ranks of total keyword frequency: 2010-2014 vs 2020+
# (2015-2019 were already excluded from df_C).
pre_2015_C = counts_C[counts_C.index < 2015].sum().rank(method='dense', pct=True)
post_2020_C = counts_C[(counts_C.index > 2019)].sum().rank(method='dense', pct=True)
rankings_C = pd.DataFrame({'Vor_2015': pre_2015_C, 'Nach_2015': post_2020_C})
fig = plt.figure(figsize=(10, 5))
# Color encodes the rank shift: positive = more prominent in 2010-2014.
plt.scatter(x=rankings_C['Nach_2015'], y=rankings_C['Vor_2015'], 
            c=rankings_C['Vor_2015'] - rankings_C['Nach_2015'],
            alpha=0.7, cmap='viridis')
# Annotate the 25 most distinctive words (highest chi2 keyness).
for i, row in rankings_C.loc[keyness_C.head(25).index].iterrows():
    plt.annotate(i, xy=(row['Nach_2015'], row['Vor_2015']))
plt.xlabel('Rank nach Frequenz nach 2020')
plt.ylabel('Rank nach Frequenz vor 2015');  # fixed typo: 'Frequen' -> 'Frequenz'

Interaktiver Scatterplot mit Plotly¶

In [54]:
fig = px.scatter(rankings_C, x='Nach_2015', y='Vor_2015', 
                 hover_name=rankings_C.index,
                 color=rankings_C['Vor_2015'] - rankings_C['Nach_2015'],
                 color_continuous_scale='Viridis', width=750, height=450)
fig.show()